Simulating an integration profile

Let’s say we want to generate numerical values that satisfy the inequalities corresponding to a given profile. This can be done using the code below.

Simulating mean expression for a given integration profile

# Load the simulation helpers and the plotting style used throughout.
source("compute_profile_means.R")
source("compute_minimum_delta.R")
source("setPowerPointStyle.R")
setPowerPointStyle()
## Warning in par(tmag = 1.2): graphical parameter "tmag" is obsolete

# Profile definitions: one row per integration profile.
PROFCODES <- read.table("profile_codes_v2.txt", header = TRUE, sep = "\t")
# Precomputed inequality constraints associated with the profiles;
# loads the object `constraints_vector` into the workspace.
load("constraints_vector")

prof_index <- 3  # index of profile to be simulated (an emergent synergy)
ntimes <- 5      # number of simulations
exp_min <- 2     # minimum of the expression-value range
exp_max <- 16    # maximum of the expression-value range
min_delta <- 1   # minimum non-zero difference among any two comparisons

# The first four columns hold the simulated mean expression for the
# four conditions (0, X, Y, X+Y); one row per simulation.
profile_means <- compute_profile_means(PROFCODES, prof_index, ntimes, exp_min,
                                       exp_max, constraints_vector, min_delta)[, 1:4]

head(profile_means)
##                                         
## [1,] 5.050826 5.050826 5.050826 7.191936
## [2,] 2.275213 2.275213 2.275213 4.760853
## [3,] 6.952922 6.952922 6.952922 9.968298
## [4,] 4.779405 4.779405 4.779405 6.597176
## [5,] 6.457617 6.457617 6.457617 8.830367

Plotting one example

source("setPowerPointStyle.R")
setPowerPointStyle()

# Label the four conditions for plotting.
colnames(profile_means) <- c("0", "X", "Y", "X+Y")
barplot(profile_means[1, ], ylab = "simulated expression")

# Additive expectation for X+Y: baseline plus the two single-treatment
# effects, i.e. 0 + (X - 0) + (Y - 0), which simplifies to X + Y - 0.
add <- profile_means[1, 2] + profile_means[1, 3] - profile_means[1, 1]
abline(h = add, col = "red")

Simulating random samples for a given integration profile

After computing the means for a given profile, we can generate random samples resembling real data. We assume that real data come from normal distributions centered around the means computed above. The standard deviation is passed as a parameter, so it is possible to simulate arbitrary noise levels.

source("simulate_from_means.R")
source("setPowerPointStyle.R")
setPowerPointStyle()

samples <- 4       # number of samples simulated for each condition
noise_level <- 0.5 # signal-to-noise is delta/noise_level = 1/0.5

# Condition of each simulated sample, in the order produced by
# simulate_from_means(). NOTE(review): the label "Y+X" differs from the
# "X+Y" used for column names and legends elsewhere in this document; the
# factor levels sort to the same position either way, but the displayed
# label is inconsistent — confirm which form is intended.
design <- factor(c(rep("0", samples), rep("X", samples),
                   rep("Y", samples), rep("Y+X", samples)))

# Draw normally distributed replicates centered on the first simulated
# profile's means, with standard deviation noise_level.
simulated_values <- simulate_from_means(profile_means[1, ], prof_index,
                                        samples, noise_level, exp_min, exp_max)
names(simulated_values) <- design

boxplot(simulated_values ~ design, ylab = "simulated expression", col = "gray")

stripchart(simulated_values ~ design, vertical = TRUE,
           method = "jitter", add = TRUE, pch = 20, col = "black", cex = 1.5)

Example with more noise.

source("setPowerPointStyle.R")
setPowerPointStyle()

# Repeat the simulation with twice the noise: signal-to-noise = 1/1.
noise_level <- 1

simulated_values <- simulate_from_means(profile_means[1, ], prof_index,
                                        samples, noise_level, exp_min, exp_max)

boxplot(simulated_values ~ design, ylab = "simulated expression", col = "gray")

stripchart(simulated_values ~ design, vertical = TRUE,
           method = "jitter", add = TRUE, pch = 20, col = "black", cex = 1.5)

Extracting statistical features from a noisy input

Let’s assume we have a noisy input in the form of N replicates of 0, X, Y, X+Y. We derive a set of statistical features, which will be used as predictors of the true profile in the classifier. Such features consist of: the Bliss index, the mean expression values in 0, X, Y, X+Y, and the p-values for all possible pairwise tests. We consider both one-tailed and two-tailed versions of the t-test and the Wilcoxon test. In total, there are 75 variables. These statistical features are computed with the function extract_stat_features as shown below.

source("extract_stat_features.R")

# Derive the 75 statistical features (Bliss index, condition means, and
# pairwise-test p-values) from the simulated replicates.
profile_features <- extract_stat_features(simulated_values)

length(profile_features)
## [1] 75

head(profile_features, 10)
##       Bliss          X0           X           Y         Y.X Delta_ADD.0 
##  2.99798415  5.21182860  5.25096713  4.26028845  7.29741114 -0.91240161 
##   Delta_X.0   Delta_Y.0 Delta_X.Y.0 Delta_X.ADD 
##  0.03913854 -0.95154015  2.08558254  0.95154015

Generating a training set with instances of all profiles and variable noise level

The classifier is based on a training set consisting of simulated data. Let’s see how we can use the above functions to generate a training set containing all profiles.

# Kept commented out: generating a full training set is slow. Uncomment
# to regenerate; the result is saved to the file "big_simulation".
#source("generate_training_set.R")

#samples = 4 #n. of samples per condition
#ntimes = 10 #n. of simulations for each profile (should be >1000 for a good training set)

#big_simulation = generate_training_set(samples, ntimes)

#the resulting dataframe has samples*ntimes*5 rows and 76 columns
#dim(big_simulation)

#save(big_simulation, file = "big_simulation")

Training the random forest classifier with simulated data

This is the code to train the random forest model. It takes a few hours with a large training set. Do not uncomment these lines if the random forest model is already present in the directory: it will be overwritten.

# Kept commented out: training takes hours with a large training set, and
# re-running would overwrite an existing "rf_model" file in this directory.
#library(randomForest)

#load("big_simulation")
#rf_model=randomForest(as.factor(TCIND)~., data = big_simulation,importance=T)

#save("rf_model",file = "rf_model")

Loading data

The data file should have the first two columns with annotation (for example probe ID and gene Symbol). Numerical data start from column 3.

# Input file: first two columns are annotation (probe ID and gene symbol);
# numerical data start from column 3. The file is tab-separated, so use
# read.delim() rather than read.csv(..., sep = '\t').
data_file <- "TNF_IFN_2.csv"
my_data <- read.delim(data_file)
head(my_data)
##   probe  gene     CTRL   CTRL.1    CTRL.2  IFN.2.5 IFN.2.5.1 IFN.2.5.2
## 1  RFC2  RFC2 8.324769 8.324769  8.132764 8.375789  8.373458  8.324769
## 2 HSPA6 HSPA6 5.413300 4.830096  5.929786 8.329249  8.006876  7.950832
## 3  PAX8  PAX8 5.594199 5.581641  5.581641 5.661038  5.581641  5.581641
## 4  UBA7  UBA7 7.361664 7.307654  7.336399 7.336399  7.217897  7.197828
## 5  CCL5  CCL5 9.985708 8.999638 10.371103 4.692704  4.517400  4.996733
## 6 ESRRA ESRRA 6.335024 6.373706  6.539027 6.630504  6.847058  6.778308
##     TNF.2.5 TNF.2.5.1 TNF.2.5.2 TNF.IFN.2.5 TNF.IFN.2.5.1 TNF.IFN.2.5.2
## 1  8.324769  8.462647  8.386247    8.324769      8.338911      8.192397
## 2  6.248139  6.207699  5.940508    7.677756      7.484339      7.540568
## 3  5.581641  5.581641  5.581641    5.504055      5.411538      5.545741
## 4  7.671212  7.607462  7.470790    7.420842      7.336399      7.095814
## 5 12.476331 12.450789 12.326590    6.663396      5.642317      6.052862
## 6  6.155354  6.370346  6.397341    6.975653      6.886920      6.539027
#get numeric data: drop the two annotation columns
expression_data <- my_data[, -(1:2)]

Preprocessing data

# Data are assumed to be on a log2 scale; a dynamic range above 25
# suggests linear-scale values, so log2-transform in that case.
if (max(expression_data) > 25) expression_data <- log2(expression_data)

#removing uninformative probes (very small coefficient of variation)
cof_cutoff <- 0.05

# Row-wise coefficient of variation: sd/mean across all samples.
cof <- apply(expression_data, 1, function(x) sd(x) / mean(x))

# Indices of the probes that pass the variability filter.
cof_filter <- which(cof > cof_cutoff)

Checking data quality with PCA

#graphical parameters
#graphical parameters
source("setPowerPointStyle.R")
setPowerPointStyle()

# PCA on the variability-filtered probes; samples become rows after t().
# Note: prcomp's argument is `scale.` — spell it out rather than relying
# on partial matching of `scale`.
my.pca <- prcomp(t(expression_data[cof_filter, ]), center = TRUE, scale. = TRUE)

#we assume the same number of samples for each of the four conditions
stopifnot(ncol(expression_data) %% 4 == 0)
samples <- ncol(expression_data) / 4

# One color per condition, in column order: 0, X, Y, X+Y.
cols <- c(rep("black", samples), rep("red", samples),
          rep("blue", samples), rep("yellow", samples))

plot(my.pca$x[, 1], my.pca$x[, 2], col = cols,
     xlab = "PC1", ylab = "PC2", pch = 20, cex = 1.5, main = data_file)

legend("bottomleft", pch = 20, col = unique(cols),
       legend = c("0", "X", "Y", "X+Y"), bty = "n", cex = 1)

Filtering based on minimum group differences

source("filter_data_on_deltas.R")

# Condition of each numeric column (annotation columns excluded).
# NOTE(review): label "Y+X" is inconsistent with "X+Y" used elsewhere in
# this document; the alphabetical level order is unaffected — confirm.
design <- factor(c(rep("0", samples), rep("X", samples),
                   rep("Y", samples), rep("Y+X", samples)))

# Keep only probes whose between-condition differences pass the
# minimum-delta filter.
my_data_filtered <- filter_data_on_deltas(my_data, design = design)

head(my_data_filtered)
##      probe    gene     CTRL   CTRL.1    CTRL.2  IFN.2.5 IFN.2.5.1
## 2    HSPA6   HSPA6 5.413300 4.830096  5.929786 8.329249  8.006876
## 5     CCL5    CCL5 9.985708 8.999638 10.371103 4.692704  4.517400
## 10     PXK     PXK 9.294279 9.508841  9.240162 9.846890 10.014370
## 11 MSANTD3 MSANTD3 9.590901 9.274737  9.831871 7.027361  7.053774
## 12 SLC46A1 SLC46A1 6.048298 6.617517  6.384521 7.951326  7.867392
## 14    PIGX    PIGX 9.020155 8.955406  9.036642 8.261126  8.799131
##    IFN.2.5.2   TNF.2.5 TNF.2.5.1 TNF.2.5.2 TNF.IFN.2.5 TNF.IFN.2.5.1
## 2   7.950832  6.248139  6.207699  5.940508    7.677756      7.484339
## 5   4.996733 12.476331 12.450789 12.326590    6.663396      5.642317
## 10 10.093442  8.939042  9.193210  8.997016    9.972881     10.101698
## 11  7.320024  8.754430  8.966617  8.826732    7.349552      7.153825
## 12  7.562287  6.122274  6.428453  6.443509    8.016323      7.685079
## 14  8.650005  8.832036  8.905883  8.848334    9.028034      9.129482
##    TNF.IFN.2.5.2
## 2       7.540568
## 5       6.052862
## 10     10.142173
## 11      7.248851
## 12      7.787337
## 14      8.781471

Random forest classifier

# Kept commented out: classification of the filtered data against the
# trained random forest model. Uncomment once "rf_model" is available.
#source("find_optimal_match.R")

#my_data_filtered_matched = find_optimal_match(my_data_filtered)

#head(my_data_filtered_matched)

Visualizing all integration profiles with frequency plots

# Kept commented out: frequency plots of all integration profiles,
# requires the classified data from the previous (commented) step.
#source("visualize_all_profiles.R")
#source("setPowerPointStyle.R")
#setPowerPointStyle()

#visualize_all_profiles(my_data_filtered_matched)

Appendix: plotting all profiles

# Appendix helper: plots every integration profile for reference.
source("generate_all_profiles.R")
generate_all_profiles()